Note: Raw data can be found in the National Database for Autism Research (NDAR) under the accession code NDARCOL0002034 but you need to request access
Data preprocessing briefly described in Transcriptome analysis of cortical tissue reveals shared sets of downregulated genes in autism and schizophrenia
Note: The rownames in the expression dataset have the following structure: ensemble ID + ‘.’ + gene name + n. I have to find what the n stands for (just different measurements of the gene? polymorphisms? other?)
# LOAD METADATA
datMeta = read.delim('./../Data/Gupta/AUT_pheno.txt')
datMeta = datMeta %>% dplyr::select(-matches('PC|SV|MEDIAN|.01'))
datMeta$age_group = cut(datMeta$Age, c(-0.5, 0, 0.6, 10, 20, 30, 40, 50, 60, 70, 80),
labels=c('Fetal','Infant','Child','10-20','20s','30s','40s','50s','60s','70s'))
# LOAD EXPRESSION DATA
datExpr = read.delim('./../Data/Gupta/NormalizedGeneExpression_AUT.txt')
# ANNOTATE GENES
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'),
values=sub('\\..*', '', rownames(datExpr)), mart=mart)
datGenes = datGenes[match(sub('\\..*', '', rownames(datExpr)), datGenes$ensembl_gene_id),]
datGenes$length = datGenes$end_position-datGenes$start_position
datGenes$ID_match_datExpr = paste(datGenes$ensembl_gene_id, gsub('-','.',datGenes$external_gene_id), sep='.')
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_with_ensembl_IDs.csv')
# GO Annotations
GO_annotations = read.csv('./../Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID' = as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal' = 1)
# Combine SFARI and GO information
gene_info = data.frame('ID'=sub('\\..*', '', rownames(datExpr))) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>%
mutate('gene.score'=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`)) %>%
mutate('gene.score'=ifelse(is.na(gene.score), 'None', gene.score))
rm(getinfo, mart, GO_annotations)
RNA-Seq for 104 cortical brain-tissue samples across three brain regions (BA10, BA19 and BA44/45), comprising 57 samples from 40 control subjects and 47 samples from 32 ASD subjects
print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$sampleid)), ' different subjects.'))
## [1] "Dataset includes 48216 genes from 104 samples belonging to 72 different subjects."
Diagnosis distribution: Fairly balanced
table(datMeta$Dx)
##
## Autism Control
## 47 57
Brain region distribution: Most samples (~60%) belong to Brodmann area 19
Where: 1=BA10, 2=BA19 and 3=BA44/45
table(datMeta$brainregion)
##
## 1 2 3
## 14 62 28
Diagnosis and brain region are balanced, although not that much
table(datMeta$Dx, datMeta$brainregion)
##
## 1 2 3
## Autism 6 24 17
## Control 8 38 11
Sex distribution: There are three times more Male samples than Female ones
table(datMeta$Gender)
##
## F M
## 24 80
Age distribution: Subjects between 2 and 82 years old with a mean close to 20
summary(datMeta$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 9.00 18.00 20.81 23.25 82.00
Note: There are some NAs in the expression data, but before diciding which genes/samples to remove it would be better to remove low expressed genes (maybe they have many of the NAs)
print(paste0('There are ', sum(is.na(datExpr)), ' NAs in the expression data'))
## [1] "There are 18060 NAs in the expression data"
Seems like there is a really dense concentration of point with mean expression ~ -1 and sd ~ 1.3
mean_expr = data.frame('ID'=rownames(datExpr),'Mean'=rowMeans(datExpr, na.rm=T),
'SD'=apply(datExpr,1,function(x) sd(x, na.rm=T)))
p1 = mean_expr %>% ggplot(aes(Mean, SD)) + geom_point(alpha=0.01, color='#0099cc') + geom_smooth(method='lm', color='gray') +
geom_vline(xintercept=-2.8, color='gray', linetype='dashed') + theme_minimal()
p2 = mean_expr %>% ggplot(aes(Mean)) + geom_density(fill='#0099cc', color='#0099cc', alpha=0.5) +
geom_vline(xintercept=-2.8, color='gray', linetype='dashed') + theme_minimal()
grid.arrange(p1, p2, ncol=2)
rm(mean_expr, p1, p2)
Filtering out genes with a mean expression lower than -2.8
to_keep = rowMeans(datExpr, na.rm=TRUE)>-2.8
datExpr = datExpr[to_keep,]
print(paste0('Removed ', sum(!to_keep, na.rm=T), ', ', sum(to_keep, na.rm=T), ' remaining'))
## [1] "Removed 1314, 46902 remaining"
rm(to_keep)
missing_points = is.na(datExpr)
print(paste0('There are ', sum(is.na(datExpr)), ' NAs in the expression data'))
## [1] "There are 17805 NAs in the expression data"
\(\qquad\) 2.1 Filtering samples with the highest number of missing values (chose to remove the top 2 samples, which have over 8 times the average number of missing values per sample)
nas_by_sample = data.frame('ID' = colnames(datExpr),
'NAs' = datExpr %>% apply(2, function(x) x %>% is.na %>% sum))
ggplotly(nas_by_sample %>% ggplot(aes(reorder(ID,-NAs), NAs, fill=NAs)) + geom_bar(stat='identity') +
xlab('Sample') + ylab('NA count') + scale_fill_viridis() + theme_minimal() +
geom_hline(yintercept=sum(is.na(datExpr))/ncol(datExpr), color='gray') + ggtitle('Missing values by sample') +
theme(axis.text.x = element_blank(), axis.ticks = element_blank(), legend.position='none'))
print(paste0('Removing samples ', paste(nas_by_sample$ID[nas_by_sample$NAs>1300], collapse = ' and ')))
## [1] "Removing samples ba19.s31 and ba19.s40"
to_keep = which(!colnames(datExpr) %in% nas_by_sample$ID[nas_by_sample$NAs>1300])
datExpr = datExpr[,to_keep]
datMeta = datMeta[to_keep,]
print(paste0('Removed ', length(!to_keep), ' samples, ', length(to_keep), ' remaining'))
## [1] "Removed 102 samples, 102 remaining"
rm(nas_by_sample, to_keep)
\(\qquad\) 2.2 Filter genes with the highest number of missing values (chose to filter the genes with more than 3% missing values (3 or more))
nas_by_gene = data.frame('ID' = rownames(datExpr),
'NAs' = datExpr %>% apply(1, function(x) x %>% is.na %>% sum))
nas_by_gene %>% ggplot(aes(NAs)) + geom_bar(fill='#0099cc') + scale_y_sqrt() + theme_minimal()
to_keep = nas_by_gene$NAs<3
datExpr = datExpr[to_keep,]
print(paste0('Removed ', sum(!to_keep), ' genes, ', sum(to_keep), ' remaining'))
## [1] "Removed 356 genes, 46546 remaining"
rm(nas_by_gene)
\(\qquad\) 2.3 Impute missing values using the mice (Multiple Imputations by Chained Equations) package
to_keep = apply(datExpr, 2, function(x) sum(is.na(x)))<550
datExpr = datExpr[,to_keep]
datMeta = datMeta[to_keep,]
print(paste0('Removed ', sum(!to_keep), ' samples, ', sum(to_keep), ' remaining'))
## [1] "Removed 6 samples, 96 remaining"
to_keep = apply(datExpr, 1, function(x) sum(is.na(x)))==0
datExpr = datExpr[to_keep,]
print(paste0('Removed ', sum(!to_keep), ' genes, ', sum(to_keep), ' remaining'))
## [1] "Removed 8287 genes, 38259 remaining"
Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)
Using \(s_{ij}=|bw(i,j)|\) to define connectivity between genes.
Filtering three samples, all from different subjects but from the Harvard lab
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
plot_data = data.frame('ID'=rownames(datMeta), 'sample'=1:length(z.ku), 'distance'=z.ku, 'Age'=datMeta$age_group)
ggplotly(plot_data %>% ggplot(aes(sample, distance, color=Age)) + geom_point() + geom_hline(yintercept=-2, color='gray') + theme_minimal())
print(paste0('Outlier samples: ', paste(as.character(plot_data$ID[plot_data$distance< -2]), collapse=', ')))
## [1] "Outlier samples: ba10.s23, ba19.s36, ba19.s7"
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
print(paste0('Removed ', sum(!to_keep), ' samples, ', sum(to_keep), ' remaining'))
## [1] "Removed 3 samples, 93 remaining"
rm(absadj, netsummary, ku, z.ku, plot_data, to_keep)
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 38259 genes and 93 samples"
Site HM indicates the laboratory the samples belong to (Harvard/Maryland). Diagnosis groups are not balanced between sites!!! Using ComBat for batch correction could erase biological signals related to the diagnosis
table(datMeta$SiteHM, datMeta$Dx)
##
## Autism Control
## H 30 9
## M 11 43
Can’t use svaseq because data is not log counts, so using sva instead
mod = model.matrix(~ Dx, datMeta)
mod0 = model.matrix(~ 1, datMeta)
sva_fit = datExpr %>% as.matrix %>% sva(mod=mod, mod0=mod0)
## Number of significant surrogate variables is: 5
## Iteration (out of 5 ):1 2 3 4 5
rm(mod, mod0)
Include SV estimations to datMeta information
sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV',1:ncol(sv_data))
datMeta = cbind(datMeta, sv_data)
rm(sv_data)
Using lmFit
mod = model.matrix(~ SV1 + SV2 + SV3 + SV4 + SV5 + Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$sampleid)
lmfit = lmFit(datExpr, design=mod, block=datMeta$sampleid, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr)) %>% mutate('ID'=sub('\\..*', '', rownames(.)))
DE_info = gene_info %>% left_join(top_genes, by='ID')
rm(mod, corfit, lmfit, fit, top_genes)
PCA: There doesn’t seem to be a specific pattern related to gender or age, there does seem to be a small differentiation by diagnosis in the 2nd principal component
pca = datExpr %>% t %>% prcomp
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(datMeta, by='ID') %>%
dplyr::select('PC1','PC2','Age','age_group','Gender','Dx') %>%
mutate('sqrt_age' = sqrt(Age))
selectable_scatter_plot(plot_data, plot_data[,-c(1,2)])
rm(pca, plot_data)
First Principal Component explains 92% of the total variance
There’s a really strong (negative) correlation between the mean expression of a gene and the 1st principal component
It seems like the lower expressed points that make a weird turn on the PC2 axis have a smaller PC3 value than the rest (rotate PC1,PC2 axis and it becomes visible). Haven’t found what else characterises them
pca = datExpr %>% prcomp
plot_data = data.frame('PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3],
'MeanExpr'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))
plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.2) + theme_minimal() +
scale_color_viridis() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
plot_data %>% plot_ly(x=~PC1, y=~PC2, z=~PC3) %>% add_markers(color=plot_data$MeanExpr, size=1) %>%
layout(title = 'PCA 3D',
scene = list(xaxis=list(title=glue('PC1 (',round(summary(pca)$importance[2,1]*100,2),'%)')),
yaxis=list(title=glue('PC2 (',round(summary(pca)$importance[2,2]*100,2),'%)')),
zaxis=list(title=glue('PC3 (',round(summary(pca)$importance[2,3]*100,2),'%)'))))
rm(pca, plot_data)
The higher the SFARI score the higher the mean expression but the lower the standard deviation. Same as Gandal’s dataset!
plot_data = data.frame('ID'=sub('\\..*', '', rownames(datExpr)),
'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(gene_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)
The higher the SFARI score the higher the mean expression but the lower the standard deviation. Same as Gandal’s dataset!
plot_data = data.frame('ID'=sub('\\..*', '', rownames(datExpr)),
'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(DE_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)
From scores 1 through 4, the higher the SFARI score, the lower the log Fold Change. Same behaviour as Gandal!
ggplotly(DE_info %>% ggplot(aes(x=gene.score, y=abs(logFC), fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
theme_minimal() + theme(legend.position='none'))
## Warning: Removed 9957 rows containing non-finite values (stat_boxplot).
save(datExpr, datMeta, datGenes, DE_info, file='./../Data/Gupta/preprocessed_data.RData')
#load('./../Data/Gupta/preprocessed_data.RData')
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] glue_1.3.1 sva_3.30.1 BiocParallel_1.16.6
## [4] genefilter_1.64.0 mgcv_1.8-26 nlme_3.1-137
## [7] edgeR_3.24.3 limma_3.38.3 Rtsne_0.15
## [10] vsn_3.50.0 Biobase_2.42.0 BiocGenerics_0.28.0
## [13] WGCNA_1.66 fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [16] forcats_0.3.0 stringr_1.4.0 dplyr_0.8.0.1
## [19] purrr_0.3.1 readr_1.3.1 tidyr_0.8.3
## [22] tibble_2.1.1 tidyverse_1.2.1 viridis_0.5.1
## [25] viridisLite_0.3.0 gridExtra_2.3 plotlyutils_0.0.0.9000
## [28] plotly_4.8.0 ggplot2_3.1.0 biomaRt_2.38.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 htmlTable_1.11.2 base64enc_0.1-3
## [4] rstudioapi_0.7 affyio_1.52.0 bit64_0.9-7
## [7] AnnotationDbi_1.44.0 mvtnorm_1.0-7 lubridate_1.7.4
## [10] xml2_1.2.0 codetools_0.2-15 splines_3.5.2
## [13] doParallel_1.0.14 impute_1.56.0 robustbase_0.93-0
## [16] knitr_1.22 Formula_1.2-3 jsonlite_1.6
## [19] Cairo_1.5-9 annotate_1.60.1 broom_0.5.1
## [22] cluster_2.0.7-1 GO.db_3.7.0 shiny_1.2.0
## [25] BiocManager_1.30.4 rrcov_1.4-3 compiler_3.5.2
## [28] httr_1.4.0 backports_1.1.2 assertthat_0.2.1
## [31] Matrix_1.2-15 lazyeval_0.2.2 cli_1.1.0
## [34] later_0.8.0 acepack_1.4.1 htmltools_0.3.6
## [37] prettyunits_1.0.2 tools_3.5.2 gtable_0.2.0
## [40] affy_1.60.0 Rcpp_1.0.1 cellranger_1.1.0
## [43] preprocessCore_1.44.0 crosstalk_1.0.0 iterators_1.0.9
## [46] xfun_0.5 rvest_0.3.2 mime_0.6
## [49] statmod_1.4.30 XML_3.98-1.11 DEoptimR_1.0-8
## [52] MASS_7.3-51.1 zlibbioc_1.28.0 scales_1.0.0
## [55] promises_1.0.1 hms_0.4.2 RColorBrewer_1.1-2
## [58] curl_3.3 yaml_2.2.0 memoise_1.1.0
## [61] rpart_4.1-13 latticeExtra_0.6-28 stringi_1.4.3
## [64] RSQLite_2.1.1 S4Vectors_0.20.1 pcaPP_1.9-73
## [67] foreach_1.4.4 checkmate_1.8.5 rlang_0.3.2
## [70] pkgconfig_2.0.2 matrixStats_0.54.0 bitops_1.0-6
## [73] evaluate_0.13 lattice_0.20-38 labeling_0.3
## [76] htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5
## [79] robust_0.4-18 plyr_1.8.4 magrittr_1.5
## [82] R6_2.4.0 IRanges_2.16.0 generics_0.0.2
## [85] Hmisc_4.1-1 fit.models_0.5-14 DBI_1.0.0
## [88] pillar_1.3.1 haven_1.1.1 foreign_0.8-71
## [91] withr_2.1.2 survival_2.43-3 RCurl_1.95-4.10
## [94] nnet_7.3-12 modelr_0.1.4 crayon_1.3.4
## [97] rmarkdown_1.12 progress_1.2.0 locfit_1.5-9.1
## [100] grid_3.5.2 readxl_1.1.0 data.table_1.12.0
## [103] blob_1.1.1 digest_0.6.18 xtable_1.8-3
## [106] httpuv_1.5.0 stats4_3.5.2 munsell_0.5.0